In [3]:
import theano
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import MDBN
In [4]:
reload(MDBN)
rna_DBN, ge_DBN, me_DBN, top_DBN =MDBN.load_network('Exp_2016-11-10_1536_run_0.npz','../MDBN_run/Run_2016-11-10_1536')
In [5]:
datafiles = MDBN.prepare_TCGA_datafiles('../data')
In [6]:
RNA_output, _ = MDBN.output_DBN(rna_DBN, datafiles['mRNA'], datadir='../data')
GE_output, _ = MDBN.output_DBN(ge_DBN, datafiles['GE'], datadir='../data')
ME_output, _ = MDBN.output_DBN(me_DBN, datafiles['ME'], datadir='../data')
In [12]:
joint_layer = np.concatenate([RNA_output, GE_output, ME_output],axis=1)
plt.imshow(joint_layer, cmap='gray',interpolation='none')
plt.axis('tight')
Out[12]:
In [30]:
top_output = top_DBN.get_output(theano.shared(joint_layer,borrow=True))
plt.imshow((top_output>0.8)*np.ones_like(top_output)-(top_output<0.2)*np.ones_like(top_output),interpolation='none',extent=[0,3,385,0])
plt.colorbar()
plt.axis('tight')
plt.xticks(np.arange(0.5,3.5,1),('0','1','2'))
Out[30]:
In [34]:
plt.imshow(top_output, interpolation='none',extent=[0,3,385,0])
plt.axis('tight')
plt.colorbar()
plt.xticks(np.arange(0.5,3.5,1),('0','1','2'))
Out[34]:
In [103]:
plt.hist(top_output)
Out[103]:
In [35]:
plt.hist(top_output[:,0])
Out[35]:
In [72]:
code = (top_output[:,0:3] > 0.5) * np.ones_like(top_output[:,0:3])
In [73]:
from utils import find_unique_classes
U = find_unique_classes(code)
cl = U[0]
In [74]:
cl
Out[74]:
In [102]:
plt.hist(cl,bins=5)
plt.xticks((0.4,1.25,2.0,2.8,3.6),('0','1','2','3','4'))
Out[102]:
In [76]:
import csv
id=[]
with open('../data/'+datafiles['mRNA']) as f:
my_csv = csv.reader(f,delimiter='\t')
id = my_csv.next()
In [77]:
stat={}
with open('../data/TCGA_Data/data_bcr_clinical_data_patient.csv') as f:
reader = csv.reader(f, delimiter='\t')
for row in reader:
patient_id=row[1]
stat[patient_id]=(row[15],row[16],row[17])
In [78]:
import re
time_list = []
event_list = []
group_list = []
print('The following case IDs were not found in clinical data')
for index, key in enumerate(id[1:]):
m = re.match('TCGA-\d+-\d+', key)
patient_id = m.group(0)
if patient_id in stat:
patient_stat = stat[patient_id]
add_group = True
try:
time_list.append(float(patient_stat[2]))
event_list.append(1)
except ValueError:
try:
time_list.append(float(patient_stat[1]))
event_list.append(0)
except ValueError:
print('No data for %s' % patient_id)
add_group = False
if add_group:
group_list.append(cl[index])
else:
print(patient_id)
In [79]:
from lifelines import KaplanMeierFitter
kmf = KaplanMeierFitter()
kmf.fit(time_list,event_observed=event_list)
kmf.plot()
Out[79]:
In [81]:
T=np.array(time_list)
E=np.array(event_list)
ix = (np.array(group_list) == 1)
kmf.fit(T[ix], E[ix], label='group 1')
ax=kmf.plot()
for i in [4]:
ix=(np.array(group_list)==i)
kmf.fit(T[ix], E[ix], label='group %d' % i)
kmf.plot(ax=ax)
In [82]:
T=np.array(time_list)
E=np.array(event_list)
ix = (np.array(group_list) == 0)
kmf.fit(T[ix], E[ix], label='group 0')
ax=kmf.plot()
for i in range(2,4):
ix=(np.array(group_list)==i)
kmf.fit(T[ix], E[ix], label='group %d' % i)
kmf.plot(ax=ax)
In [83]:
with open('/Users/gluca/SoftwareProjects/Thesis/MDBN/MATLAB_classes.txt') as f:
matlab_classes = f.readlines()
ML = []
for s in matlab_classes:
ML.append(float(s))
ML = np.asarray(ML)
The class ids obtained from MATLAB and from Theano are not the same, the formula below happens to convert them
In [86]:
new_cl = np.mod(ML+3,5)
In [87]:
new_cl
Out[87]:
In [88]:
plt.hist(new_cl,bins=5)
Out[88]:
Number of samples with the same class
In [89]:
np.sum(new_cl == cl)
Out[89]:
In [90]:
319./385.
Out[90]:
In [ ]: